In [ ]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Apply seaborn's "darkgrid" theme (grey plot background with grid lines) to all figures below.
sns.set_theme(style="darkgrid")
In [ ]:
# Load the turn table, discard the 'Unnamed: 0' column (presumably a stale index
# column from an earlier CSV export - confirm) and derive each turn's duration
# as 0.2 * (end_idx - start_idx).
# NOTE(review): 0.2 looks like the sampling period in seconds - confirm.
df = (
    pd.read_csv("manual_turns.csv")
    .drop(columns=['Unnamed: 0'])
    .assign(
        turn_duration=lambda turns: 0.2 * (
            turns['end_idx'].astype('float') - turns['start_idx'].astype('float')
        )
    )
)
# Summary statistics, one row per numeric column.
df.describe().T
Out[ ]:
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| participant_id | 145.0 | 1813.000000 | 643.813616 | 407.000000 | 2102.000000 | 2105.000000 | 2108.000000 | 2111.000000 |
| path_num | 145.0 | 2.303448 | 0.719994 | 1.000000 | 2.000000 | 2.000000 | 3.000000 | 3.000000 |
| turn_num | 145.0 | 2.027586 | 1.301414 | 1.000000 | 1.000000 | 2.000000 | 3.000000 | 7.000000 |
| start_idx | 145.0 | 520.820690 | 432.977140 | 45.000000 | 245.000000 | 393.000000 | 649.000000 | 2124.000000 |
| end_idx | 145.0 | 585.337931 | 446.037869 | 84.000000 | 289.000000 | 477.000000 | 713.000000 | 2273.000000 |
| walking_direction_lag | 145.0 | -4.862069 | 28.262809 | -164.000000 | -14.000000 | 0.000000 | 7.000000 | 80.000000 |
| walking_direction_base_corr | 145.0 | 0.089897 | 0.451317 | -0.857578 | -0.326919 | 0.210051 | 0.448113 | 0.866582 |
| walking_direction_lagged_corr | 145.0 | 0.488734 | 0.156062 | 0.178460 | 0.382983 | 0.467089 | 0.607509 | 0.910992 |
| walking_direction_dtw | 145.0 | 56.049318 | 34.593451 | 3.665118 | 29.714181 | 46.373857 | 73.494923 | 191.612370 |
| speeds_lag | 145.0 | -0.434483 | 17.279489 | -49.000000 | -7.000000 | -1.000000 | 4.000000 | 92.000000 |
| speeds_base_corr | 145.0 | 0.247639 | 0.331154 | -0.663930 | 0.040225 | 0.290227 | 0.516385 | 0.836185 |
| speeds_lagged_corr | 145.0 | 0.507480 | 0.138251 | 0.184940 | 0.425878 | 0.493722 | 0.605422 | 0.850110 |
| speeds_dtw | 145.0 | 40.597031 | 19.204629 | 10.222585 | 26.853865 | 37.784558 | 50.858723 | 140.600285 |
| mean_distance | 145.0 | 2.313964 | 1.488589 | 0.336612 | 1.348608 | 2.010133 | 3.021977 | 13.639054 |
| mean_speed_difference | 145.0 | 0.364955 | 0.123349 | 0.137744 | 0.278000 | 0.340217 | 0.436231 | 0.797638 |
| mean_walking_direction_difference | 145.0 | 58.546456 | 16.733966 | 18.568244 | 46.072398 | 58.550238 | 68.860939 | 100.850062 |
| mean_pace_asymmetry | 145.0 | 0.410603 | 0.112106 | 0.096129 | 0.342006 | 0.395674 | 0.467422 | 0.870018 |
| turn_duration | 145.0 | 12.903448 | 6.792034 | 3.600000 | 7.800000 | 11.800000 | 15.400000 | 50.000000 |
In [ ]:
def _lag_sign_label(lag):
    """Return 'Negative' for lag < 0, 'Zero' for lag == 0, otherwise 'Positive'."""
    if lag < 0:
        return 'Negative'
    if lag == 0:
        return 'Zero'
    return 'Positive'


# Sign-category counts for 'speeds_lag'
speeds_lag_counts = df['speeds_lag'].map(_lag_sign_label).value_counts()
# Sign-category counts for 'walking_direction_lag'
# (the variable keeps its historical name although it holds direction lags)
walking_speed_lag_counts = df['walking_direction_lag'].map(_lag_sign_label).value_counts()

# One bar chart per lag column, each shown as its own figure.
for counts, title, xlabel in (
    (speeds_lag_counts, "Counts for 'speeds_lag'", "Speeds Lag Category"),
    (walking_speed_lag_counts, "Counts for 'walking_direction_lag'", "Walking Direction Lag Category"),
):
    counts.plot.bar()
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Count")
    plt.show()
In [ ]:
# Box plot of the turn_duration column over all turns in df.
ax = sns.boxplot(data=df, x='turn_duration')
ax.set_title('Box plot of turn duration')
plt.show()
In [ ]:
# turn_duration was computed as 0.2 * (end_idx - start_idx), so dividing it by 0.2
# recovers the turn length in index steps; each DTW distance is divided by that
# length to give a per-step value.
samples_per_turn = df['turn_duration'] / 0.2
for signal in ('walking_direction', 'speeds'):
    df[f'normalized_{signal}_dtw'] = df[f'{signal}_dtw'] / samples_per_turn
In [ ]:
# Magnitude of each lag, ignoring its sign.
for lag_column in ('walking_direction_lag', 'speeds_lag'):
    df[f'abs_{lag_column}'] = df[lag_column].abs()
In [ ]:
# Columns used by every correlation / distribution cell below, grouped by the
# signal they describe. Commented-out names are candidate features that are
# currently excluded.
_turn_level_features = [
    'turn_duration',
    'mean_distance',
    'mean_pace_asymmetry',
]
_walking_direction_features = [
    'walking_direction_lag',
    'abs_walking_direction_lag',
    'walking_direction_dtw',
    'normalized_walking_direction_dtw',
    # 'walking_direction_base_corr',
    'walking_direction_lagged_corr',
    # 'mean_walking_direction_difference',
]
_speed_features = [
    'speeds_lag',
    'abs_speeds_lag',
    'speeds_dtw',
    'normalized_speeds_dtw',
    # 'speeds_base_corr',
    'speeds_lagged_corr',
    # 'mean_speed_difference',
]
relevant_features = [
    *_turn_level_features,
    *_walking_direction_features,
    *_speed_features,
]
In [ ]:
# Pairwise Pearson correlations between the selected features over all turns.
corr = df[relevant_features].corr(method='pearson', numeric_only=True)
# Cells with |r| < 0.3 are masked out so only the stronger correlations are drawn.
mask = corr.abs() < 0.3
# A large canvas keeps the per-cell annotations readable.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, annot=True, fmt=".2f", mask=mask, ax=ax)
ax.set_title(f"Metrics Correlation Matrix - Original Data (n={len(df)})")
plt.show()
In [ ]:
# Flag every turn whose [start_idx, end_idx] window contains the start or the end
# of another turn recorded for the same participant_id and path_num.
# The row itself always satisfies both tests, hence the "> 1" comparisons.
# NOTE(review): a turn lying strictly inside a longer turn is NOT flagged by this
# test (only the longer one is) - the name "overlapping_and_not_subset" suggests
# this is intended; confirm.
df['overlapping'] = False
for index, row in df.iterrows():
    same_run = (df['participant_id'] == row['participant_id']) & (df['path_num'] == row['path_num'])
    starts_inside = same_run & df['start_idx'].between(row['start_idx'], row['end_idx'])
    ends_inside = same_run & df['end_idx'].between(row['start_idx'], row['end_idx'])
    if starts_inside.sum() > 1 or ends_inside.sum() > 1:
        df.at[index, 'overlapping'] = True

# Keep only the flagged turns (swap in the commented line to use every turn instead).
overlapping_and_not_subset = df.loc[df['overlapping']]
# overlapping_and_not_subset = df
oans = overlapping_and_not_subset

# Same correlation heatmap as for the full data, restricted to the flagged turns.
corr_oans = oans[relevant_features].corr(method='pearson', numeric_only=True)
mask = corr_oans.abs() < 0.3
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_oans, annot=True, fmt=".2f", mask=mask, ax=ax)
ax.set_title(f"Metrics Correlation Matrix - Overlapping Data (n={len(oans)})")
plt.show()
In [ ]:
# Keep turns whose lagged correlation exceeds the threshold for BOTH walking
# direction and speed, then restrict to the turns flagged as overlapping.
threshold = 0.3
direction_above_threshold = df['walking_direction_lagged_corr'] > threshold
speed_above_threshold = df['speeds_lagged_corr'] > threshold
filtered_df = df[direction_above_threshold & speed_above_threshold]
filtered_oans = filtered_df.loc[filtered_df['overlapping']]
# filtered_oans = filtered_df

# Correlation heatmap of the filtered subset; cells with |r| < 0.3 are hidden.
corr_filtered_oans = filtered_oans[relevant_features].corr(method='pearson', numeric_only=True)
mask = corr_filtered_oans.abs() < 0.3
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_filtered_oans, annot=True, fmt=".2f", mask=mask, ax=ax)
ax.set_title(f"Metrics Correlation Matrix - Filtered Overlapping Data (n={len(filtered_oans)})")
plt.show()
In [ ]:
# Summary statistics of the filtered subset, one row per numeric column.
filtered_oans.describe().transpose()
Out[ ]:
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| participant_id | 85.0 | 1885.729412 | 573.373276 | 407.000000 | 2102.000000 | 2105.000000 | 2107.000000 | 2111.000000 |
| path_num | 85.0 | 2.258824 | 0.742469 | 1.000000 | 2.000000 | 2.000000 | 3.000000 | 3.000000 |
| turn_num | 85.0 | 1.905882 | 1.436128 | 1.000000 | 1.000000 | 1.000000 | 2.000000 | 7.000000 |
| start_idx | 85.0 | 486.105882 | 471.125451 | 45.000000 | 170.000000 | 372.000000 | 558.000000 | 2124.000000 |
| end_idx | 85.0 | 552.423529 | 488.578974 | 84.000000 | 218.000000 | 431.000000 | 652.000000 | 2273.000000 |
| walking_direction_lag | 85.0 | -6.494118 | 23.732449 | -101.000000 | -14.000000 | -3.000000 | 2.000000 | 46.000000 |
| walking_direction_base_corr | 85.0 | 0.107917 | 0.459778 | -0.857578 | -0.333389 | 0.254710 | 0.463901 | 0.864534 |
| walking_direction_lagged_corr | 85.0 | 0.510698 | 0.139671 | 0.301173 | 0.394259 | 0.473310 | 0.612160 | 0.910992 |
| walking_direction_dtw | 85.0 | 56.276960 | 33.142225 | 3.665118 | 29.264177 | 48.481499 | 74.596861 | 155.080360 |
| speeds_lag | 85.0 | -0.847059 | 17.121767 | -48.000000 | -7.000000 | -1.000000 | 2.000000 | 92.000000 |
| speeds_base_corr | 85.0 | 0.300209 | 0.329221 | -0.632908 | 0.167255 | 0.380213 | 0.531561 | 0.836185 |
| speeds_lagged_corr | 85.0 | 0.541314 | 0.121374 | 0.307027 | 0.451785 | 0.532035 | 0.620851 | 0.836185 |
| speeds_dtw | 85.0 | 39.709762 | 17.468461 | 10.673369 | 28.068922 | 38.937636 | 48.258955 | 84.791303 |
| mean_distance | 85.0 | 2.140396 | 0.969318 | 0.515054 | 1.262167 | 1.924040 | 2.837807 | 4.393454 |
| mean_speed_difference | 85.0 | 0.352921 | 0.111447 | 0.137744 | 0.273464 | 0.342893 | 0.426791 | 0.741000 |
| mean_walking_direction_difference | 85.0 | 57.025789 | 16.736801 | 18.568244 | 45.656283 | 57.769570 | 68.023336 | 95.338484 |
| mean_pace_asymmetry | 85.0 | 0.394076 | 0.088883 | 0.173147 | 0.333504 | 0.380928 | 0.446291 | 0.662044 |
| turn_duration | 85.0 | 13.263529 | 5.826312 | 3.600000 | 9.000000 | 12.800000 | 16.600000 | 29.800000 |
| normalized_walking_direction_dtw | 85.0 | 0.856659 | 0.389172 | 0.203618 | 0.598522 | 0.751000 | 1.097587 | 1.960544 |
| normalized_speeds_dtw | 85.0 | 0.627214 | 0.186108 | 0.256842 | 0.511719 | 0.611162 | 0.723188 | 1.187326 |
| abs_walking_direction_lag | 85.0 | 16.517647 | 18.161060 | 0.000000 | 3.000000 | 11.000000 | 26.000000 | 101.000000 |
| abs_speeds_lag | 85.0 | 9.576471 | 14.180282 | 0.000000 | 2.000000 | 4.000000 | 11.000000 | 92.000000 |
In [ ]:
from scipy.stats import pearsonr

# For every feature, draw one scatter + regression panel per other feature whose
# correlation with it (in corr_filtered_oans) exceeds 0.3 in absolute value.
# Panels are laid out row by row on a grid that is at most 3 columns wide.
for feature in relevant_features:
    # Skip the feature itself and close relatives whose name is a prefix/suffix
    # of the other (e.g. 'speeds_lag' vs 'abs_speeds_lag').
    to_display = [
        feature2
        for feature2 in relevant_features
        if not feature.startswith(feature2)
        and not feature2.startswith(feature)
        and not feature.endswith(feature2)
        and not feature2.endswith(feature)
        and np.abs(corr_filtered_oans.loc[feature, feature2]) > 0.3
    ]
    if len(to_display) == 0:
        continue

    n_cols = min(len(to_display), 3)
    n_rows = int(np.ceil(len(to_display) / 3))
    # squeeze=False always yields a 2-D array of axes, whatever the grid shape.
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows), squeeze=False)

    for i, feature2 in enumerate(to_display):
        # Row-major placement. The previous int(np.ceil(i/3))-1 row index sent
        # panel 0 to the LAST row (index -1) and panel 3 to row 0, scrambling
        # the order whenever the grid had more than one row.
        ax = axs[i // 3, i % 3]

        peares = pearsonr(filtered_oans[feature], filtered_oans[feature2], alternative='two-sided')
        p_val = peares.pvalue
        CI = peares.confidence_interval(confidence_level=0.95)

        # Scatter plot
        sns.scatterplot(x=feature, y=feature2, data=filtered_oans, ax=ax)
        # Regression line
        sns.regplot(x=feature, y=feature2, data=filtered_oans, scatter=False, line_kws={'color': 'red'}, ax=ax)
        ax.set_title(f"compared with {feature2}\ncorr: {round(corr_filtered_oans.loc[feature, feature2], 3)}, p_val: {round(p_val,5)}, CI: {[round(c,3) for c in CI]}", fontweight='bold')

    # Hide grid cells left over when the last row is not full.
    for unused_ax in axs.flat[len(to_display):]:
        unused_ax.set_visible(False)

    # Figure-level heading naming the feature all panels are compared against.
    fig.suptitle(f"{feature}'s correlations", fontweight='bold')
    plt.tight_layout()
    plt.show()
In [ ]:
from PIL import Image
import seaborn as sns
from scipy import stats

# For each feature, show the four pre-rendered plots (paths, distance, walking
# directions, speeds) of the turn with the highest value and of the turn with
# the lowest value. Images are read from
# ./turns/<participant_id>/<person_robot>/run_<path_num>/turn_<turn_num>/.
for feature in relevant_features:
    extreme_rows = (
        ("highest", filtered_oans[feature].nlargest(1).index),
        ("lowest", filtered_oans[feature].nsmallest(1).index),
    )
    for extreme_label, row_index in extreme_rows:
        res = filtered_oans.loc[row_index, ['participant_id', 'person_robot', 'path_num', 'turn_num', feature]]
        if res.empty:
            # No row selected (e.g. the feature has no non-NaN values): nothing to show.
            continue

        base_path = f"./turns/{res['participant_id'].values[0]}/{res['person_robot'].values[0]}/run_{res['path_num'].values[0]}/turn_{res['turn_num'].values[0]}/"

        fig, axs = plt.subplots(1, 4, figsize=(20, 5))
        for ax, image_name in zip(axs, ("paths.png", "distance.png", "walking_directions.png", "speeds.png")):
            # The context manager closes the file handle once imshow has copied
            # the pixel data; previously every opened image stayed open.
            with Image.open(base_path + image_name) as img:
                ax.imshow(img)
            ax.axis('off')

        # Identifying columns plus the feature value of the selected row, floats rounded to 3 decimals.
        to_print_dict = {
            k: round(v, 3) if isinstance(v, float) else v
            for k, v in res.to_dict(orient='records')[0].items()
        }
        to_print_str = ", ".join([f"{k}: {v}" for k, v in to_print_dict.items()])
        fig.suptitle(f"{feature} - {extreme_label} value\n {to_print_str}", fontweight='bold')
        plt.tight_layout()
        plt.show()

    # Blank lines between the outputs of consecutive features.
    print("\n\n")
In [ ]:
# Histogram (with KDE curve) of every relevant feature in the filtered subset,
# laid out row by row on a grid that is at most 3 columns wide.
n_cols = min(len(relevant_features), 3)
n_rows = int(np.ceil(len(relevant_features) / 3))
# squeeze=False always yields a 2-D array of axes, whatever the grid shape.
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
for i, feature in enumerate(relevant_features):
    # Row-major placement. The previous int(np.ceil(i/3))-1 row index put the
    # first feature in the LAST row (index -1) and shifted every third panel
    # up by one row.
    ax = axs[i // 3, i % 3]
    sns.histplot(data=filtered_oans, x=feature, kde=True, ax=ax)
    ax.set_title(feature)
    ax.set_xlabel('')
    ax.set_ylabel('')
# Hide grid cells left over when the last row is not full.
for unused_ax in axs.flat[len(relevant_features):]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()
In [ ]:
import scipy.stats as stats

# Normal probability (Q-Q) plot of every relevant feature in the filtered
# subset, laid out row by row on a grid that is at most 3 columns wide.
n_cols = min(len(relevant_features), 3)
n_rows = int(np.ceil(len(relevant_features) / 3))
# squeeze=False always yields a 2-D array of axes, whatever the grid shape.
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
for i, feature in enumerate(relevant_features):
    # Row-major placement. The previous int(np.ceil(i/3))-1 row index put the
    # first feature in the LAST row (index -1) and shifted every third panel
    # up by one row.
    ax = axs[i // 3, i % 3]
    stats.probplot(filtered_oans[feature], dist="norm", plot=ax)
    # Set after probplot so these labels are the ones left on the axes.
    ax.set_title(feature)
    ax.set_xlabel('Theoretical Quantiles')
    ax.set_ylabel('Ordered Values')
# Hide grid cells left over when the last row is not full.
for unused_ax in axs.flat[len(relevant_features):]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()
In [ ]:
# Box plot of every relevant feature in the filtered subset, laid out row by
# row on a grid that is at most 3 columns wide.
n_cols = min(len(relevant_features), 3)
n_rows = int(np.ceil(len(relevant_features) / 3))
# squeeze=False always yields a 2-D array of axes, whatever the grid shape.
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
for i, feature in enumerate(relevant_features):
    # Row-major placement. The previous int(np.ceil(i/3))-1 row index put the
    # first feature in the LAST row (index -1) and shifted every third panel
    # up by one row.
    ax = axs[i // 3, i % 3]
    sns.boxplot(data=filtered_oans, y=feature, ax=ax)
    ax.set_title(feature)
    ax.set_xlabel('')
    ax.set_ylabel('')
# Hide grid cells left over when the last row is not full.
for unused_ax in axs.flat[len(relevant_features):]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()
In [ ]:
def _categorize_lag(lag_value):
    """Map a lag to 'Negative' (< 0), 'Zero' (== 0) or 'Positive' (everything else)."""
    if lag_value < 0:
        return 'Negative'
    return 'Zero' if lag_value == 0 else 'Positive'


# Sign-category counts for 'speeds_lag' within the filtered subset
speeds_lag_counts = filtered_oans['speeds_lag'].map(_categorize_lag).value_counts()
# Sign-category counts for 'walking_direction_lag' within the filtered subset
# (the variable keeps its historical name although it holds direction lags)
walking_speed_lag_counts = filtered_oans['walking_direction_lag'].map(_categorize_lag).value_counts()

# Bar chart of the speed-lag categories.
ax = speeds_lag_counts.plot.bar()
ax.set_title("Counts for 'speeds_lag'")
ax.set_xlabel("Speeds Lag Category")
ax.set_ylabel("Count")
plt.show()

# Bar chart of the walking-direction-lag categories.
ax = walking_speed_lag_counts.plot.bar()
ax.set_title("Counts for 'walking_direction_lag'")
ax.set_xlabel("Walking Direction Lag Category")
ax.set_ylabel("Count")
plt.show()